{
    "cells": [
        {
            "cell_type": "markdown",
            "id": "7a9f093e-e027-405b-ae3d-17dda9e30cd0",
            "metadata": {},
            "source": [
                "# NYC Wikipedia Embeddings Demo"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": null,
            "id": "cadae9f2",
            "metadata": {},
            "outputs": [],
            "source": [
                "import logging\n",
                "import sys\n",
                "\n",
                "# Send INFO-level (and above) log records to stdout so they show up in cell output.\n",
                "# basicConfig() already attaches a stdout StreamHandler to the root logger, so no\n",
                "# second handler is added here: an extra one would print every record twice.\n",
                "logging.basicConfig(stream=sys.stdout, level=logging.INFO)"
            ]
        },
        {
            "cell_type": "markdown",
            "id": "3e594a62-110e-40b3-ad1e-c99f49a4e537",
            "metadata": {},
            "source": [
                "This notebook demonstrates embedding-based querying with `TreeIndex` and `SummaryIndex`, using the Wikipedia article on New York City as the source document."
            ]
        },
        {
            "cell_type": "markdown",
            "id": "b145f093-afb0-46b8-a81f-466af8478439",
            "metadata": {},
            "source": [
                "### Setup + Data Prep"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": null,
            "id": "d038dcc1",
            "metadata": {},
            "outputs": [],
            "source": [
                "import logging\n",
                "import sys\n",
                "\n",
                "# Switch logging to DEBUG for the rest of the notebook.\n",
                "# The root logger is already configured by the first cell, and basicConfig() is a\n",
                "# silent no-op once the root logger has handlers. force=True removes the existing\n",
                "# handlers first, so the DEBUG level takes effect and each record is printed once.\n",
                "logging.basicConfig(stream=sys.stdout, level=logging.DEBUG, force=True)"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": null,
            "id": "b4b4387b-413e-4016-ba1e-88b3d9410a38",
            "metadata": {},
            "outputs": [],
            "source": [
                "# Fetch the plain-text extract of the \"New York City\" Wikipedia article and\n",
                "# save it to data/nyc_text.txt, which the index cells below load from data/.\n",
                "from pathlib import Path\n",
                "\n",
                "import requests\n",
                "\n",
                "http_response = requests.get(\n",
                "    \"https://en.wikipedia.org/w/api.php\",\n",
                "    params={\n",
                "        \"action\": \"query\",\n",
                "        \"format\": \"json\",\n",
                "        \"titles\": \"New York City\",\n",
                "        \"prop\": \"extracts\",\n",
                "        # 'exintro': True,\n",
                "        \"explaintext\": True,\n",
                "    },\n",
                "    # requests applies no timeout by default; without one a stalled connection\n",
                "    # would hang this cell indefinitely.\n",
                "    timeout=30,\n",
                ")\n",
                "# Fail here on an HTTP error rather than with an opaque KeyError/JSON error below.\n",
                "http_response.raise_for_status()\n",
                "response = http_response.json()\n",
                "\n",
                "# The API keys \"pages\" by page id; only one title was requested, so take the first.\n",
                "page = next(iter(response[\"query\"][\"pages\"].values()))\n",
                "nyc_text = page[\"extract\"]\n",
                "\n",
                "data_path = Path(\"data\")\n",
                "# exist_ok=True keeps the cell safe to re-run once the directory exists.\n",
                "data_path.mkdir(exist_ok=True)\n",
                "\n",
                "# The article contains non-ASCII characters, so pin UTF-8 instead of relying on the\n",
                "# platform's default text encoding.\n",
                "with open(data_path / \"nyc_text.txt\", \"w\", encoding=\"utf-8\") as fp:\n",
                "    fp.write(nyc_text)"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": null,
            "id": "f1a9eb90-335c-4214-8bb6-fd1edbe3ccbd",
            "metadata": {},
            "outputs": [],
            "source": [
                "# OpenAI API key.\n",
                "# If OPENAI_API_KEY is already set in the environment it is kept as-is; otherwise\n",
                "# replace the placeholder below with your own key. setdefault() is used so that\n",
                "# running this cell never overwrites a real key with the placeholder string.\n",
                "import os\n",
                "\n",
                "os.environ.setdefault(\"OPENAI_API_KEY\", \"INSERT OPENAI KEY\")"
            ]
        },
        {
            "cell_type": "markdown",
            "id": "def4eca7-ba03-48e2-b18f-fd669b91a5fc",
            "metadata": {},
            "source": [
                "### TreeIndex - Embedding-based Query"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": null,
            "id": "8d0b2364-4806-4656-81e7-3f6e4b910b5b",
            "metadata": {},
            "outputs": [
                {
                    "name": "stderr",
                    "output_type": "stream",
                    "text": [
                        "None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.\n"
                    ]
                }
            ],
            "source": [
                "from llama_index import TreeIndex, SimpleDirectoryReader\n",
                "from IPython.display import Markdown"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": null,
            "id": "1298bbb4-c99e-431e-93ef-eb32c0a2fc2a",
            "metadata": {
                "tags": []
            },
            "outputs": [],
            "source": [
                "documents = SimpleDirectoryReader(\"data\").load_data()\n",
                "index = TreeIndex.from_documents(documents)"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": null,
            "id": "68c9ebfe-b1b6-4f4e-9278-174346de8c90",
            "metadata": {
                "tags": []
            },
            "outputs": [],
            "source": [
                "# set Logging to DEBUG for more detailed outputs\n",
                "query_engine = index.as_query_engine(retriever_mode=\"embedding\")\n",
                "response = query_engine.query(\n",
                "    \"What is the name of the professional women's basketball team in New York City?\"\n",
                ")"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": null,
            "id": "e1000018-18de-410d-b6d9-c66bf37ccf1d",
            "metadata": {},
            "outputs": [],
            "source": [
                "display(Markdown(f\"<b>{response}</b>\"))"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": null,
            "id": "4fc3f18a-0ef9-453c-acf8-7aedd784cdcf",
            "metadata": {},
            "outputs": [],
            "source": [
                "response = query_engine.query(\n",
                "    \"What battles took place in New York City in the American Revolution?\",\n",
                ")"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": null,
            "id": "5588289b-9fdc-4b86-bab9-808c97be05e1",
            "metadata": {},
            "outputs": [],
            "source": [
                "display(Markdown(f\"<b>{response}</b>\"))"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": null,
            "id": "53265fd4-da98-4cf9-abfb-3f76105fd2ff",
            "metadata": {},
            "outputs": [],
            "source": [
                "# set Logging to DEBUG for more detailed outputs\n",
                "response = query_engine.query(\"What are the airports in New York City?\")"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": null,
            "id": "bc08060f-b031-4dc5-a980-427dd2407b5d",
            "metadata": {},
            "outputs": [],
            "source": [
                "display(Markdown(f\"<b>{response}</b>\"))"
            ]
        },
        {
            "cell_type": "markdown",
            "id": "63009734-deda-4159-9f2b-0af19720e913",
            "metadata": {},
            "source": [
                "### SummaryIndex - Embedding-based Query"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": null,
            "id": "fd8920ae-8115-457c-b092-21e50cc3bcc0",
            "metadata": {},
            "outputs": [],
            "source": [
                "from llama_index import SummaryIndex, SimpleDirectoryReader\n",
                "from IPython.display import Markdown"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": null,
            "id": "27c8bbee-daf5-494d-ba66-b60142592a96",
            "metadata": {},
            "outputs": [],
            "source": [
                "# Reload the documents from data/ and build a SummaryIndex from them.\n",
                "# This rebinds `index`, replacing the TreeIndex created in the previous section, so\n",
                "# re-running the TreeIndex query-engine cell after this point would build its engine\n",
                "# from the SummaryIndex instead.\n",
                "documents = SimpleDirectoryReader(\"data\").load_data()\n",
                "index = SummaryIndex.from_documents(documents)"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": null,
            "id": "2cbf24c2-060e-4216-9188-a6746af1830d",
            "metadata": {
                "tags": []
            },
            "outputs": [],
            "source": [
                "# set Logging to DEBUG for more detailed outputs\n",
                "query_engine = index.as_query_engine(retriever_mode=\"embedding\")\n",
                "response = query_engine.query(\n",
                "    \"What is the name of the professional women's basketball team in New York City?\"\n",
                ")"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": null,
            "id": "14e1b19f-fbf7-49fd-a96f-cbb37bafd498",
            "metadata": {},
            "outputs": [],
            "source": [
                "display(Markdown(f\"<b>{response}</b>\"))"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": null,
            "id": "48b86c8d-9149-4395-9d52-6070597c814d",
            "metadata": {},
            "outputs": [],
            "source": [
                "# set Logging to DEBUG for more detailed outputs\n",
                "# The retriever mode is fixed when the engine is created with as_query_engine();\n",
                "# query() takes only the question, so retriever_mode is not passed again here.\n",
                "response = query_engine.query(\n",
                "    \"What battles took place in New York City in the American Revolution?\",\n",
                ")"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": null,
            "id": "57fbd90c-a8d3-4738-8531-e8f48a953167",
            "metadata": {},
            "outputs": [],
            "source": [
                "display(Markdown(f\"<b>{response}</b>\"))"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": null,
            "id": "7ab01446-9b07-4222-a577-eeb4617ce4fc",
            "metadata": {},
            "outputs": [],
            "source": [
                "# set Logging to DEBUG for more detailed outputs\n",
                "# The engine was already created with retriever_mode=\"embedding\"; query() takes\n",
                "# only the question itself, so the keyword is not repeated here.\n",
                "response = query_engine.query(\"What are the airports in New York City?\")"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": null,
            "id": "091afaea-a61e-4a7c-b2f1-7df387380b8b",
            "metadata": {},
            "outputs": [],
            "source": [
                "display(Markdown(f\"<b>{response}</b>\"))"
            ]
        },
        {
            "cell_type": "markdown",
            "id": "aca03087-d6cc-4d87-8ec6-185fa03d9fea",
            "metadata": {},
            "source": [
                "## Try out other embeddings!\n",
                "\n",
                "(courtesy of LangChain)"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": null,
            "id": "27c24411-7049-45c7-862c-0857c03db580",
            "metadata": {},
            "outputs": [],
            "source": [
                "from llama_index import SummaryIndex, SimpleDirectoryReader, ServiceContext\n",
                "from IPython.display import Markdown"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": null,
            "id": "b9ff1944-a06a-4b05-adae-a2ef25e74e8b",
            "metadata": {},
            "outputs": [],
            "source": [
                "# load in HF embedding model from langchain\n",
                "from langchain.embeddings.huggingface import HuggingFaceEmbeddings\n",
                "from llama_index import LangchainEmbedding\n",
                "\n",
                "embed_model = LangchainEmbedding(HuggingFaceEmbeddings())"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": null,
            "id": "1494cabb-0123-408a-9d81-8e02db9b3acd",
            "metadata": {},
            "outputs": [],
            "source": [
                "# configure\n",
                "# Wrap the HuggingFace embedding model in a ServiceContext and hand it to the query engine.\n",
                "# NOTE(review): `index` was built earlier without this service_context. Confirm that\n",
                "# passing it to as_query_engine() is enough for retrieval to embed with `embed_model`;\n",
                "# if it is not, the index needs to be rebuilt with service_context=service_context.\n",
                "service_context = ServiceContext.from_defaults(embed_model=embed_model)\n",
                "\n",
                "# set Logging to DEBUG for more detailed outputs\n",
                "query_engine = index.as_query_engine(\n",
                "    retriever_mode=\"embedding\",\n",
                "    service_context=service_context,\n",
                ")\n",
                "response = query_engine.query(\n",
                "    \"What is the name of the professional women's basketball team in New York City?\",\n",
                ")"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": null,
            "id": "4d96a2e7-4eb1-474e-b855-eca3efed1bad",
            "metadata": {},
            "outputs": [],
            "source": [
                "response"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": null,
            "id": "80510d3a-8bf8-47f2-b1d4-3d1bd0d5a1bb",
            "metadata": {},
            "outputs": [],
            "source": []
        }
    ],
    "metadata": {
        "kernelspec": {
            "display_name": "Python 3 (ipykernel)",
            "language": "python",
            "name": "python3"
        },
        "language_info": {
            "codemirror_mode": {
                "name": "ipython",
                "version": 3
            },
            "file_extension": ".py",
            "mimetype": "text/x-python",
            "name": "python",
            "nbconvert_exporter": "python",
            "pygments_lexer": "ipython3",
            "version": "3.10.9"
        }
    },
    "nbformat": 4,
    "nbformat_minor": 5
}
